###############################################################################-
# Created By: Pietryka
# Creation Date:  December 30, 2020
# Purpose: Fit random forest models to evaluate variable importance
# Contact: mpietryka@fsu.edu
###############################################################################-


#  1. LOAD PACKAGES & DATA =======================================

## 1A. LOAD PACKAGES    -----------------------------
library(tidyverse)   # DATA CLEANING FUNCTIONS
library(lme4)        # MULTILEVEL MODELS
library(texreg)      # DISPLAY MODELS
library(lubridate)   # DATES
library(tidymodels)  # VALIDATION
library(vip)         # variable importance plots
library(patchwork)

# parallel processing
library(parallel)
cores <- parallel::detectCores() - 1
cores


# plot preferences
source("SC-Plots-Preferences.R")


## 1B. LOAD DATA    -----------------------------



# 'dyads_df' OBJECT CREATED IN '1-clean-the-data/SC-1- Dyadic Data.R'
dyads_df <- read_rds("../Data/Derived/dyads_df.rds")
attr(dyads_df, "source")


# SOURCE (source_from): INNOVATIVE (new)
# FOCAL (source_to):    FULL TEXT

# 'new_to_full_df' OBJECT CREATED IN
# '1-clean-the-data/SC-4- Measure Similarities.R'
df_all <- read_rds("../Data/Derived/new_to_full_df.rds")   %>%
  left_join(dyads_df)  %>%
  # FOCUS ON RELEVANT YEARS
  filter(date_to > date_from) %>%
  # POST-1776
  filter(year_to >= 1776)  %>%
  # POST CIVIL WAR
  mutate(postwar_to = date_to >= ymd("1861-04-12"))  %>%
  # recode party same
  mutate(party_same = factor(party_same))

# prewar data
df_pre  <- df_all  %>% filter(postwar_to == FALSE)

# postwar data
df_post <- df_all  %>% filter(postwar_to == TRUE)

df_list <- list(df_all, df_pre, df_post)

# 2. IDENTIFY VARIABLES FOR THE MODEL ========================
outcome <- "ratio"

predictors <- c(
  #"share_border", don't need since it is function of distance
  "distance",
  "distance_rel",
  "both_south",
  "same_state",
  "us_from",
  "year_to",
  "year_from",
  "time_diff",
  "party_same"
)

#  3. PREPARE DATA ===================================

## 3A. split data for test, training, validation --------

set.seed(45543)
splits  <- map(df_list, ~.x  %>%
  select(one_of(outcome), one_of(predictors), postwar_to)  %>%
  initial_split())

df_other <- map(splits, training)
df_test  <- map(splits, testing)

set.seed(436456)
val_set <- map(df_other, validation_split, prop = 0.80)




## 3B. specify the model -----------------

rf_mod <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger", num.threads = cores) %>%
  set_mode("regression")

rf_recipe <- map(
  df_other,
  ~ recipe(ratio ~ ., data = .x) %>% step_rm(postwar_to)
)




## 3C. Create the workflow ------------

rf_workflow <- map(
  rf_recipe,
  ~ workflow() %>% add_model(rf_mod) %>% add_recipe(.x)
  )



# 4. train and tune the model ===================

set.seed(3445475)
rf_res <- map2(rf_workflow, val_set, tune_grid, grid = 25,
               control = control_grid(save_pred = TRUE),
               metrics = metric_set(rmse, rsq, ccc))



## 4A. find best performing model parameters --------------------

a <- rf_res %>%
  map_df(show_best, metric = "rmse", .id = "group")

b <- rf_res %>%
  map_df(show_best, metric = "ccc", .id = "group")

c <- rf_res %>%
  map_df(show_best, metric = "rsq", .id = "group")

a  %>%
  inner_join(b, by = c("group",  "mtry", "min_n" )) %>%
  inner_join(c, by = c("group",  "mtry", "min_n" ))

# We choose model with the lowest rmse and highest r2 for the full data series


## 4B. last fit -------------

last_rf_mod <- rand_forest(mtry = 6, min_n = 15, trees = 1000) %>%
  set_engine(
    "ranger",
    num.threads = cores,
    importance = "permutation") %>%
  set_mode("regression")

# the last workflow
last_rf_workflow <- map(rf_workflow, update_model, last_rf_mod)

# the last fit
set.seed(207765)
last_rf_fit <- map2(last_rf_workflow, splits, last_fit)

last_rf_fit

map(last_rf_fit, collect_metrics)

# Extract importance scores

vi_df <- map(last_rf_fit, pluck, ".workflow", 1)  %>%
  map(pull_workflow_fit) %>%
  map_df(vi,  .id = "period", scale = TRUE)


# 5. Plot results =====================


## 5A. plotting settings --------------

var_labels <- list(
  same_state         = "Same-state dyad",
  us_from            = "US is the source",
  distance           = "Absolute distance",
  distance_rel       = "Relative distance",
  both_south         = "Southern dyad",
  year_to            = "Year focal constitution was ratified",
  year_from          = "Year source constitution was ratified",
  time_diff         =  "Temporal difference",
  party_same         = "Partisan congruence"
)

period_labs <- list(
  `1` = "A. 1776-1859",
  `2` = "B. 1862-1986",
  `3` = "C. 1776-1986"
)

## 5B. creaate plot --------------------

rf_plot <- vi_df  %>%
  mutate(period = as.integer(period))  %>%
  mutate(period_labs = recode(period, !!!period_labs))  %>%
  mutate(Variable = recode(Variable, !!!var_labels))  %>%
  group_by(-period)  %>%
  mutate(Variable = fct_reorder(Variable, Importance))  %>%
  ggplot(aes(x = Variable, y = Importance
)) +
  geom_segment(aes(
    x = Variable,
    xend = Variable,
    y = 0,
    yend = Importance,
  ),
  size = 0.9) +
  geom_point(size = 4) +
  #print value for each bar as well
  facet_wrap( ~ period_labs) +
  coord_flip() +
  theme_sc(
    axis_title_just	= "c",
    plot_margin = margin(5, 5, 5, 5),
    axis_title_size = 13,
    axis_text_size = 12
    ) +
  xlab(NULL) +
  ylab("Importance") +
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank()
    )

graphics.off()
windows(9, 3.5)
rf_plot



# 6. SAVE  =======================

graphics.off()
windows(9, 3.5)
rf_plot


ggsave("Plots/random-forest.png")


# DISPLAY VERSION NUMBERS FOR R & PACKAGES IN USE
sessionInfo()

